{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import twitterloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "load word2vec finished\n"
     ]
    }
   ],
   "source": [
    "twitterloader.load_word2vec(\"/home/hadoop/word2vec.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max_sent: 210 ,  max_seq_len: 1999\n",
      "4664 data loaded\n"
     ]
    }
   ],
   "source": [
    "twitterloader.load_data_fast()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from twitterloader import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "x, l, y = get_df_batch(0, 20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Twitter 数据按照事件拆分训练验证集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "twt_dir = \"/home/hadoop/pheme-rnr-dataset/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "dirs = os.listdir(twt_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'README'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# os.listdir() returns entries in arbitrary order, so remove the README entry by name\n",
    "# instead of by position; the guard keeps this cell safe to re-run.\n",
    "dirs.pop(dirs.index(\"README\")) if \"README\" in dirs else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['germanwings-crash',\n",
       " 'ottawashooting',\n",
       " 'sydneysiege',\n",
       " 'ferguson',\n",
       " 'charliehebdo']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dirs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def list_files(path):\n",
    "    fs = os.listdir(path)\n",
    "    for f1 in fs:\n",
    "        tmp_path = os.path.join(path, f1)\n",
    "        if not os.path.isdir(tmp_path):\n",
    "            if tmp_path.split('.')[-1] == 'json':\n",
    "                files.append(tmp_path)\n",
    "        else:\n",
    "            list_files(tmp_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# One list of source-tweet JSON paths per event directory, ordered like `dirs`.\n",
    "all_file_list = []\n",
    "for event_name in dirs:  # cover every event rather than a hard-coded count of 5\n",
    "    files = []  # list_files() appends into this notebook-level list\n",
    "    list_files(os.path.join(twt_dir, event_name))\n",
    "    all_file_list.append([f for f in files if \"source-tweet\" in f])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[469, 890, 1221, 1143, 2079]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of source-tweet files collected for each entry of all_file_list\n",
    "list(map(len, all_file_list))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "> 可以从第二个事件中拆分出验证集和测试集"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "#### 拆分ottawashooting事件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524976773671292928/source-tweet/524976773671292928.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524944544681705472/source-tweet/524944544681705472.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524922729485848576/source-tweet/524922729485848576.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524950887119482882/source-tweet/524950887119482882.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947867975561216/source-tweet/524947867975561216.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524982066564567040/source-tweet/524982066564567040.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524981689781870593/source-tweet/524981689781870593.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949339131904000/source-tweet/524949339131904000.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947964620701696/source-tweet/524947964620701696.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525066876058361856/source-tweet/525066876058361856.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524931830173421568/source-tweet/524931830173421568.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525037656820826112/source-tweet/525037656820826112.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524950899899113473/source-tweet/524950899899113473.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524931019745812480/source-tweet/524931019745812480.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524936365427806208/source-tweet/524936365427806208.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524955620538343426/source-tweet/524955620538343426.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524989654995849216/source-tweet/524989654995849216.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524948206023880704/source-tweet/524948206023880704.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525046443103354880/source-tweet/525046443103354880.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524969144832491520/source-tweet/524969144832491520.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524928195011698688/source-tweet/524928195011698688.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524974885362479104/source-tweet/524974885362479104.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524944006577668096/source-tweet/524944006577668096.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525013278343581696/source-tweet/525013278343581696.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524983050187579393/source-tweet/524983050187579393.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947502903345153/source-tweet/524947502903345153.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525060425184858112/source-tweet/525060425184858112.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525020620636307456/source-tweet/525020620636307456.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524968205690089472/source-tweet/524968205690089472.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524924346411020288/source-tweet/524924346411020288.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524956294395224064/source-tweet/524956294395224064.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524955062486196224/source-tweet/524955062486196224.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935758143324160/source-tweet/524935758143324160.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525033357722550272/source-tweet/525033357722550272.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525039876366798849/source-tweet/525039876366798849.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525059348574130176/source-tweet/525059348574130176.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524981513637888000/source-tweet/524981513637888000.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524925215235911680/source-tweet/524925215235911680.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524987851046658048/source-tweet/524987851046658048.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524953017389707264/source-tweet/524953017389707264.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524981436252950528/source-tweet/524981436252950528.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525002842021445632/source-tweet/525002842021445632.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524952437137092608/source-tweet/524952437137092608.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524964948683005952/source-tweet/524964948683005952.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524925987239120897/source-tweet/524925987239120897.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524958599433691136/source-tweet/524958599433691136.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524964989975937024/source-tweet/524964989975937024.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525024066152169473/source-tweet/525024066152169473.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524966582544781313/source-tweet/524966582544781313.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524958128392376320/source-tweet/524958128392376320.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524969501704855552/source-tweet/524969501704855552.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524937204552843268/source-tweet/524937204552843268.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525055052801515521/source-tweet/525055052801515521.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524926301678096384/source-tweet/524926301678096384.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947424482435073/source-tweet/524947424482435073.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524982531075371008/source-tweet/524982531075371008.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525039242771464193/source-tweet/525039242771464193.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525033625788907520/source-tweet/525033625788907520.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524975552638504960/source-tweet/524975552638504960.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524971492820647936/source-tweet/524971492820647936.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525027651908800512/source-tweet/525027651908800512.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524977651476623360/source-tweet/524977651476623360.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524966392702189569/source-tweet/524966392702189569.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524962142563610625/source-tweet/524962142563610625.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524993533212897281/source-tweet/524993533212897281.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525058976376193024/source-tweet/525058976376193024.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525011494896148480/source-tweet/525011494896148480.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524954028287877121/source-tweet/524954028287877121.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524960247518330881/source-tweet/524960247518330881.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524923293711998976/source-tweet/524923293711998976.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525032520124207104/source-tweet/525032520124207104.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525036214286815232/source-tweet/525036214286815232.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525029433632366592/source-tweet/525029433632366592.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524970097711267841/source-tweet/524970097711267841.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524985687704338432/source-tweet/524985687704338432.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525029913230053377/source-tweet/525029913230053377.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524959630158725120/source-tweet/524959630158725120.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525039686931075072/source-tweet/525039686931075072.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524982608716120065/source-tweet/524982608716120065.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525026715123601408/source-tweet/525026715123601408.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525046462460477440/source-tweet/525046462460477440.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524972443308683264/source-tweet/524972443308683264.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524964223185850368/source-tweet/524964223185850368.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524951419175309312/source-tweet/524951419175309312.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524954623145431041/source-tweet/524954623145431041.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525055879545380864/source-tweet/525055879545380864.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524961408740380672/source-tweet/524961408740380672.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524950404283772930/source-tweet/524950404283772930.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525029954908860417/source-tweet/525029954908860417.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947416869388288/source-tweet/524947416869388288.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525026592788348929/source-tweet/525026592788348929.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524943063937208320/source-tweet/524943063937208320.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524976963862429697/source-tweet/524976963862429697.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525021132022628353/source-tweet/525021132022628353.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525048987649576960/source-tweet/525048987649576960.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524945170777645059/source-tweet/524945170777645059.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524927281048080385/source-tweet/524927281048080385.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524948264886759424/source-tweet/524948264886759424.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524987107081977856/source-tweet/524987107081977856.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524969483317026816/source-tweet/524969483317026816.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524936793633083394/source-tweet/524936793633083394.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524964929083023360/source-tweet/524964929083023360.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524943012602716160/source-tweet/524943012602716160.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524926768054956032/source-tweet/524926768054956032.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524957429889777665/source-tweet/524957429889777665.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949837729787905/source-tweet/524949837729787905.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525027873867198465/source-tweet/525027873867198465.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524945684966166528/source-tweet/524945684966166528.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525047767325220865/source-tweet/525047767325220865.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524952883343925249/source-tweet/524952883343925249.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525023912221212672/source-tweet/525023912221212672.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949073154301952/source-tweet/524949073154301952.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949828711620608/source-tweet/524949828711620608.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525067386849091584/source-tweet/525067386849091584.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524978526974668800/source-tweet/524978526974668800.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524984773211860994/source-tweet/524984773211860994.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524977321430630400/source-tweet/524977321430630400.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524975642379816960/source-tweet/524975642379816960.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524994809912504321/source-tweet/524994809912504321.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524932327907270656/source-tweet/524932327907270656.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525022700923019264/source-tweet/525022700923019264.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524959332656349187/source-tweet/524959332656349187.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525030696738652160/source-tweet/525030696738652160.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525029016001318913/source-tweet/525029016001318913.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524946921547628545/source-tweet/524946921547628545.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524967727266398209/source-tweet/524967727266398209.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525035552643751936/source-tweet/525035552643751936.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524975093966209025/source-tweet/524975093966209025.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524965723672961024/source-tweet/524965723672961024.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525067500447612929/source-tweet/525067500447612929.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524951585231994880/source-tweet/524951585231994880.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525018799347875841/source-tweet/525018799347875841.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524941132237910016/source-tweet/524941132237910016.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524976929703997441/source-tweet/524976929703997441.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524961362448240642/source-tweet/524961362448240642.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524950264303075328/source-tweet/524950264303075328.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525062013551001600/source-tweet/525062013551001600.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524943812339437569/source-tweet/524943812339437569.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525007664544493569/source-tweet/525007664544493569.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935769614331904/source-tweet/524935769614331904.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524976980799000578/source-tweet/524976980799000578.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525025279803424768/source-tweet/525025279803424768.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524975847846203393/source-tweet/524975847846203393.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524948866773184512/source-tweet/524948866773184512.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524969201102901248/source-tweet/524969201102901248.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524962199543230464/source-tweet/524962199543230464.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524926235030589440/source-tweet/524926235030589440.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524957566502445056/source-tweet/524957566502445056.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524983325275197440/source-tweet/524983325275197440.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524983366261936130/source-tweet/524983366261936130.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524991844292521984/source-tweet/524991844292521984.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525030117014507523/source-tweet/525030117014507523.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524979668282519552/source-tweet/524979668282519552.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524978172110987264/source-tweet/524978172110987264.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524940014065811456/source-tweet/524940014065811456.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524955159840182272/source-tweet/524955159840182272.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525061787860103169/source-tweet/525061787860103169.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524974456851410944/source-tweet/524974456851410944.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525036457732231169/source-tweet/525036457732231169.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524944399890124801/source-tweet/524944399890124801.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524923462398513152/source-tweet/524923462398513152.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524946179390701568/source-tweet/524946179390701568.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524955557686284288/source-tweet/524955557686284288.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525042177643016192/source-tweet/525042177643016192.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525022663102963712/source-tweet/525022663102963712.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524943721302089728/source-tweet/524943721302089728.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524962748237889536/source-tweet/524962748237889536.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525027539845410816/source-tweet/525027539845410816.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949628689461249/source-tweet/524949628689461249.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525025806079504385/source-tweet/525025806079504385.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525049831312875521/source-tweet/525049831312875521.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525010245811466240/source-tweet/525010245811466240.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524934673760530432/source-tweet/524934673760530432.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525067419635576834/source-tweet/525067419635576834.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524934827909210112/source-tweet/524934827909210112.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524990180034609152/source-tweet/524990180034609152.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524937542131793920/source-tweet/524937542131793920.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524965786059026432/source-tweet/524965786059026432.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524931752725585921/source-tweet/524931752725585921.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524990163446140928/source-tweet/524990163446140928.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524945197705461763/source-tweet/524945197705461763.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524946609453678593/source-tweet/524946609453678593.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949443607412737/source-tweet/524949443607412737.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524933657493250048/source-tweet/524933657493250048.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524948268355051520/source-tweet/524948268355051520.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525046432085327872/source-tweet/525046432085327872.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524980579956449281/source-tweet/524980579956449281.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524996383120818176/source-tweet/524996383120818176.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524926472432410625/source-tweet/524926472432410625.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525052332116312064/source-tweet/525052332116312064.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524930538009006081/source-tweet/524930538009006081.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935216134369281/source-tweet/524935216134369281.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525003468659228672/source-tweet/525003468659228672.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524956129017995264/source-tweet/524956129017995264.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524957752138149888/source-tweet/524957752138149888.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524985325190656000/source-tweet/524985325190656000.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525002243100401664/source-tweet/525002243100401664.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524937968138878976/source-tweet/524937968138878976.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525017500061536258/source-tweet/525017500061536258.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524929769469538305/source-tweet/524929769469538305.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524968337747767298/source-tweet/524968337747767298.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524991576163250176/source-tweet/524991576163250176.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935791794221057/source-tweet/524935791794221057.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524941727753568257/source-tweet/524941727753568257.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525038517844709376/source-tweet/525038517844709376.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935485370929152/source-tweet/524935485370929152.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524969963850051584/source-tweet/524969963850051584.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525031558949134336/source-tweet/525031558949134336.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525051365195014144/source-tweet/525051365195014144.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525023057526947840/source-tweet/525023057526947840.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524977285133524993/source-tweet/524977285133524993.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525025552634494976/source-tweet/525025552634494976.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524945879455653889/source-tweet/524945879455653889.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525057374965739520/source-tweet/525057374965739520.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525032872647065600/source-tweet/525032872647065600.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524948941956448257/source-tweet/524948941956448257.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524940716733370374/source-tweet/524940716733370374.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524937330923417600/source-tweet/524937330923417600.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949439367380993/source-tweet/524949439367380993.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524926279309860864/source-tweet/524926279309860864.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524952154482946048/source-tweet/524952154482946048.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524988712367955970/source-tweet/524988712367955970.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524924225463668738/source-tweet/524924225463668738.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524948554344054785/source-tweet/524948554344054785.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525027317551079424/source-tweet/525027317551079424.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524967711492014080/source-tweet/524967711492014080.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524980744658382848/source-tweet/524980744658382848.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524928119955013632/source-tweet/524928119955013632.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525019752507658240/source-tweet/525019752507658240.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947674164760577/source-tweet/524947674164760577.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525040545870385153/source-tweet/525040545870385153.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524937594090844160/source-tweet/524937594090844160.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949733983657984/source-tweet/524949733983657984.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524927356830760961/source-tweet/524927356830760961.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524953110285152256/source-tweet/524953110285152256.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525035564014526464/source-tweet/525035564014526464.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524977076433321987/source-tweet/524977076433321987.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949026064850944/source-tweet/524949026064850944.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524933380929245184/source-tweet/524933380929245184.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935633463037953/source-tweet/524935633463037953.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524923341359300608/source-tweet/524923341359300608.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524932056560963584/source-tweet/524932056560963584.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935816603107328/source-tweet/524935816603107328.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524966904885428226/source-tweet/524966904885428226.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524966770428243968/source-tweet/524966770428243968.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524979133366165504/source-tweet/524979133366165504.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525030416261337089/source-tweet/525030416261337089.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524953391634866176/source-tweet/524953391634866176.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525065430713790465/source-tweet/525065430713790465.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525040509937795072/source-tweet/525040509937795072.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524938107142283264/source-tweet/524938107142283264.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947196131966976/source-tweet/524947196131966976.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525021083892994049/source-tweet/525021083892994049.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525025329916948481/source-tweet/525025329916948481.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524964564018790400/source-tweet/524964564018790400.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947870735429632/source-tweet/524947870735429632.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524967298604359684/source-tweet/524967298604359684.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525056576038518785/source-tweet/525056576038518785.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524981223291383809/source-tweet/524981223291383809.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524929893759737856/source-tweet/524929893759737856.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524972166006464512/source-tweet/524972166006464512.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524928878075457536/source-tweet/524928878075457536.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525008613392515073/source-tweet/525008613392515073.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525024181218725888/source-tweet/525024181218725888.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524970966842376192/source-tweet/524970966842376192.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524936592159698944/source-tweet/524936592159698944.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524951625556066304/source-tweet/524951625556066304.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524975705206304769/source-tweet/524975705206304769.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524925124303396864/source-tweet/524925124303396864.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524995771587108864/source-tweet/524995771587108864.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524954586696933377/source-tweet/524954586696933377.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524927288077746176/source-tweet/524927288077746176.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524972277406773248/source-tweet/524972277406773248.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524969878823137280/source-tweet/524969878823137280.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525041091507388416/source-tweet/525041091507388416.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935345998422016/source-tweet/524935345998422016.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524939378552864768/source-tweet/524939378552864768.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524923610851729408/source-tweet/524923610851729408.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525038096086499328/source-tweet/525038096086499328.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525028734991343617/source-tweet/525028734991343617.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524959836778536961/source-tweet/524959836778536961.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524987432656453632/source-tweet/524987432656453632.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525047349757100034/source-tweet/525047349757100034.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524980219749597185/source-tweet/524980219749597185.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524940030109036544/source-tweet/524940030109036544.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525023610243928064/source-tweet/525023610243928064.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524931324763992064/source-tweet/524931324763992064.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524961659685588995/source-tweet/524961659685588995.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525021358879936512/source-tweet/525021358879936512.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524925033626738688/source-tweet/524925033626738688.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524925050739490816/source-tweet/524925050739490816.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935246647926784/source-tweet/524935246647926784.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524974779540197376/source-tweet/524974779540197376.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525023642007371776/source-tweet/525023642007371776.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524929106987991040/source-tweet/524929106987991040.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525009850104037376/source-tweet/525009850104037376.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524953918166794240/source-tweet/524953918166794240.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524974975623892992/source-tweet/524974975623892992.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524970731202166784/source-tweet/524970731202166784.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524954630279925760/source-tweet/524954630279925760.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524938857180303360/source-tweet/524938857180303360.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524952094986350592/source-tweet/524952094986350592.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524961721744900097/source-tweet/524961721744900097.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524938433262006272/source-tweet/524938433262006272.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524932574909857792/source-tweet/524932574909857792.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524969137660231680/source-tweet/524969137660231680.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524941720249978880/source-tweet/524941720249978880.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524953840127574016/source-tweet/524953840127574016.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524922507380670464/source-tweet/524922507380670464.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524943132409212928/source-tweet/524943132409212928.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525025463648137216/source-tweet/525025463648137216.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524945665827545089/source-tweet/524945665827545089.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524926643325132800/source-tweet/524926643325132800.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524959027516932096/source-tweet/524959027516932096.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524950428598153216/source-tweet/524950428598153216.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524931110279852032/source-tweet/524931110279852032.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524975142641086465/source-tweet/524975142641086465.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524942058042449920/source-tweet/524942058042449920.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524948264148160513/source-tweet/524948264148160513.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524930575472136192/source-tweet/524930575472136192.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524931913426157568/source-tweet/524931913426157568.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524956383318650880/source-tweet/524956383318650880.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524936137589411840/source-tweet/524936137589411840.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524941504796962816/source-tweet/524941504796962816.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524991067675189250/source-tweet/524991067675189250.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524925553489760256/source-tweet/524925553489760256.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524932935137628160/source-tweet/524932935137628160.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525040767317082113/source-tweet/525040767317082113.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524961908903149568/source-tweet/524961908903149568.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524934381828603904/source-tweet/524934381828603904.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525005886272843776/source-tweet/525005886272843776.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947744809418752/source-tweet/524947744809418752.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525071376084791297/source-tweet/525071376084791297.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524942353355005952/source-tweet/524942353355005952.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524970851675176960/source-tweet/524970851675176960.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524973811092193280/source-tweet/524973811092193280.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524979946218061824/source-tweet/524979946218061824.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524926184069808129/source-tweet/524926184069808129.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524951711941922818/source-tweet/524951711941922818.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524959809402331137/source-tweet/524959809402331137.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525048557569454080/source-tweet/525048557569454080.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524958227768020992/source-tweet/524958227768020992.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949069945634816/source-tweet/524949069945634816.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524943490887991296/source-tweet/524943490887991296.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947149134774272/source-tweet/524947149134774272.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524957647653863424/source-tweet/524957647653863424.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524943114885427203/source-tweet/524943114885427203.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524936966492942336/source-tweet/524936966492942336.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524983403775799297/source-tweet/524983403775799297.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525039208139087872/source-tweet/525039208139087872.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524974318087061504/source-tweet/524974318087061504.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524956372199555072/source-tweet/524956372199555072.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525025695425380352/source-tweet/525025695425380352.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524962072262881280/source-tweet/524962072262881280.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525052223030845440/source-tweet/525052223030845440.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524965775036387329/source-tweet/524965775036387329.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524983146266505216/source-tweet/524983146266505216.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524939327399559170/source-tweet/524939327399559170.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524954833648771072/source-tweet/524954833648771072.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524953604717686784/source-tweet/524953604717686784.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525071638464049152/source-tweet/525071638464049152.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524976526891417600/source-tweet/524976526891417600.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525026947307700225/source-tweet/525026947307700225.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525049639016615937/source-tweet/525049639016615937.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949306101743616/source-tweet/524949306101743616.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524923480173981696/source-tweet/524923480173981696.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525024518713397249/source-tweet/525024518713397249.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524946336190586880/source-tweet/524946336190586880.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525021697003782145/source-tweet/525021697003782145.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524926445496197120/source-tweet/524926445496197120.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524930293921107969/source-tweet/524930293921107969.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524945676443340800/source-tweet/524945676443340800.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525045145079533568/source-tweet/525045145079533568.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525062993898635264/source-tweet/525062993898635264.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524965406377664512/source-tweet/524965406377664512.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524952341561491459/source-tweet/524952341561491459.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524923676484177920/source-tweet/524923676484177920.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949021744713728/source-tweet/524949021744713728.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524977669902172160/source-tweet/524977669902172160.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524934941914968064/source-tweet/524934941914968064.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524936872666353664/source-tweet/524936872666353664.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935586025861121/source-tweet/524935586025861121.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949887591653376/source-tweet/524949887591653376.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524943886553468929/source-tweet/524943886553468929.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524981945131102211/source-tweet/524981945131102211.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524927695633666049/source-tweet/524927695633666049.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524941041301225472/source-tweet/524941041301225472.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524955304199352320/source-tweet/524955304199352320.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525036393983381504/source-tweet/525036393983381504.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524951893467234304/source-tweet/524951893467234304.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524942149973188608/source-tweet/524942149973188608.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524999488067633152/source-tweet/524999488067633152.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524951314556796929/source-tweet/524951314556796929.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524943504192311296/source-tweet/524943504192311296.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525023025792835585/source-tweet/525023025792835585.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524952995034042368/source-tweet/524952995034042368.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524933399828369409/source-tweet/524933399828369409.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525028522361114625/source-tweet/525028522361114625.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524974106308251648/source-tweet/524974106308251648.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525034687245582337/source-tweet/525034687245582337.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525036240848945152/source-tweet/525036240848945152.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524967620588896256/source-tweet/524967620588896256.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525028171549523971/source-tweet/525028171549523971.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524944637489086464/source-tweet/524944637489086464.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524971124845985792/source-tweet/524971124845985792.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525044262648676353/source-tweet/525044262648676353.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524958743000125440/source-tweet/524958743000125440.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525029232691650561/source-tweet/525029232691650561.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524940659778920448/source-tweet/524940659778920448.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524980504324358144/source-tweet/524980504324358144.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524998906799988736/source-tweet/524998906799988736.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524991452527722496/source-tweet/524991452527722496.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524987184836005889/source-tweet/524987184836005889.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524948703850029056/source-tweet/524948703850029056.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525070933556350977/source-tweet/525070933556350977.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525039002307424256/source-tweet/525039002307424256.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525044004287971329/source-tweet/525044004287971329.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525040097272819712/source-tweet/525040097272819712.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524987990524047361/source-tweet/524987990524047361.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524932792925569024/source-tweet/524932792925569024.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525058825909731328/source-tweet/525058825909731328.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524942470472548352/source-tweet/524942470472548352.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524969679790817280/source-tweet/524969679790817280.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524950584349454337/source-tweet/524950584349454337.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524938162900967424/source-tweet/524938162900967424.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524941529928839168/source-tweet/524941529928839168.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524976486902951936/source-tweet/524976486902951936.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525027116287811584/source-tweet/525027116287811584.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524952407625957376/source-tweet/524952407625957376.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524942946765131776/source-tweet/524942946765131776.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524982699220803586/source-tweet/524982699220803586.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524924619812511746/source-tweet/524924619812511746.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524938282732642305/source-tweet/524938282732642305.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524947971679744000/source-tweet/524947971679744000.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524941124893700096/source-tweet/524941124893700096.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524925730053181440/source-tweet/524925730053181440.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949315245322241/source-tweet/524949315245322241.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525057025785356288/source-tweet/525057025785356288.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525026219100995584/source-tweet/525026219100995584.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525001895631654912/source-tweet/525001895631654912.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525020326816927744/source-tweet/525020326816927744.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525057606151602176/source-tweet/525057606151602176.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525072912957452289/source-tweet/525072912957452289.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525023858831523841/source-tweet/525023858831523841.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524924034124107776/source-tweet/524924034124107776.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524940077978640384/source-tweet/524940077978640384.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525008463819464704/source-tweet/525008463819464704.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525030963781595136/source-tweet/525030963781595136.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524944569528750080/source-tweet/524944569528750080.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524929497205055488/source-tweet/524929497205055488.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524942548066791424/source-tweet/524942548066791424.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524994747883323392/source-tweet/524994747883323392.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524967376719466496/source-tweet/524967376719466496.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524944881941495809/source-tweet/524944881941495809.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524930671220105216/source-tweet/524930671220105216.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524929796862918656/source-tweet/524929796862918656.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525068915068923904/source-tweet/525068915068923904.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524983581983375360/source-tweet/524983581983375360.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525058633730891776/source-tweet/525058633730891776.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524957517294886912/source-tweet/524957517294886912.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525033778637721600/source-tweet/525033778637721600.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525006356731166720/source-tweet/525006356731166720.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524981557770350592/source-tweet/524981557770350592.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525040408653754368/source-tweet/525040408653754368.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524965165759221760/source-tweet/524965165759221760.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525033278555033601/source-tweet/525033278555033601.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524935085863481344/source-tweet/524935085863481344.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524949526487265280/source-tweet/524949526487265280.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/525051210349289472/source-tweet/525051210349289472.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/rumours/524923678472286209/source-tweet/524923678472286209.json']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "r = list(filter(lambda fname: \"/rumours/\" in fname, all_file_list[1]))\n",
    "r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524961561677680640/source-tweet/524961561677680640.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524935943925796866/source-tweet/524935943925796866.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955243185176576/source-tweet/524955243185176576.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524942952733220865/source-tweet/524942952733220865.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524959715957424128/source-tweet/524959715957424128.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524958409381384192/source-tweet/524958409381384192.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525026066944262144/source-tweet/525026066944262144.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524924987774631936/source-tweet/524924987774631936.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524950395110842369/source-tweet/524950395110842369.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524939362048290817/source-tweet/524939362048290817.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525031610136014848/source-tweet/525031610136014848.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525070912199335937/source-tweet/525070912199335937.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524930594950479873/source-tweet/524930594950479873.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524978912145985536/source-tweet/524978912145985536.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524937114539282433/source-tweet/524937114539282433.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524950507023245313/source-tweet/524950507023245313.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524963649455407105/source-tweet/524963649455407105.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524981463033581568/source-tweet/524981463033581568.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955388190662657/source-tweet/524955388190662657.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524934142958788608/source-tweet/524934142958788608.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524976313766273025/source-tweet/524976313766273025.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524973444946202624/source-tweet/524973444946202624.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524965077318107137/source-tweet/524965077318107137.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524937421180653568/source-tweet/524937421180653568.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524971386838978560/source-tweet/524971386838978560.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524947036228317184/source-tweet/524947036228317184.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525049044201377793/source-tweet/525049044201377793.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525068387899031552/source-tweet/525068387899031552.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525020980037816320/source-tweet/525020980037816320.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524967510563889152/source-tweet/524967510563889152.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524998460848611328/source-tweet/524998460848611328.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955317201666048/source-tweet/524955317201666048.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524937675115999232/source-tweet/524937675115999232.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524983599851130880/source-tweet/524983599851130880.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525044166519451648/source-tweet/525044166519451648.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525008469385302017/source-tweet/525008469385302017.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955251464736768/source-tweet/524955251464736768.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524934300459077632/source-tweet/524934300459077632.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524985046198157313/source-tweet/524985046198157313.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524970444156571648/source-tweet/524970444156571648.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524944537618485248/source-tweet/524944537618485248.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524983236712476672/source-tweet/524983236712476672.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524922078638903296/source-tweet/524922078638903296.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524951439417044992/source-tweet/524951439417044992.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524984209505812480/source-tweet/524984209505812480.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525018288330272768/source-tweet/525018288330272768.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979881525137409/source-tweet/524979881525137409.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524991470416461824/source-tweet/524991470416461824.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524965701535412224/source-tweet/524965701535412224.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524957773810135040/source-tweet/524957773810135040.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524956449836130304/source-tweet/524956449836130304.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524936008870400000/source-tweet/524936008870400000.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524971258996588544/source-tweet/524971258996588544.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524991832451981312/source-tweet/524991832451981312.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525068253341970432/source-tweet/525068253341970432.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524973223356940288/source-tweet/524973223356940288.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524996756220956672/source-tweet/524996756220956672.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524938009834450945/source-tweet/524938009834450945.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979548195024898/source-tweet/524979548195024898.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525048410831736832/source-tweet/525048410831736832.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524993612279336960/source-tweet/524993612279336960.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524947368865964033/source-tweet/524947368865964033.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524991119164473344/source-tweet/524991119164473344.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524984505376182272/source-tweet/524984505376182272.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524957554829717504/source-tweet/524957554829717504.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525058578365698048/source-tweet/525058578365698048.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524950679274942464/source-tweet/524950679274942464.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525007169956347904/source-tweet/525007169956347904.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524950159160250368/source-tweet/524950159160250368.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524944178586091520/source-tweet/524944178586091520.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525070811439579138/source-tweet/525070811439579138.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525045203648790528/source-tweet/525045203648790528.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524939142485262337/source-tweet/524939142485262337.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525055967076290560/source-tweet/525055967076290560.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525056022046851074/source-tweet/525056022046851074.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524950455743291392/source-tweet/524950455743291392.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524943658748219392/source-tweet/524943658748219392.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524966946467749888/source-tweet/524966946467749888.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524947071770435585/source-tweet/524947071770435585.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524950339074523137/source-tweet/524950339074523137.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524960580944560128/source-tweet/524960580944560128.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524930851747164160/source-tweet/524930851747164160.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525060022569422849/source-tweet/525060022569422849.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525016983646244865/source-tweet/525016983646244865.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524946753154727936/source-tweet/524946753154727936.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524961055211286528/source-tweet/524961055211286528.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524940803601227777/source-tweet/524940803601227777.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524978422188376064/source-tweet/524978422188376064.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524997030536830976/source-tweet/524997030536830976.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525035832336736257/source-tweet/525035832336736257.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525012738280792065/source-tweet/525012738280792065.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524945814796644352/source-tweet/524945814796644352.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524983395026497536/source-tweet/524983395026497536.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524954661703680000/source-tweet/524954661703680000.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525063393309630464/source-tweet/525063393309630464.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524970142719963136/source-tweet/524970142719963136.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524981427260366848/source-tweet/524981427260366848.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524960221694005248/source-tweet/524960221694005248.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524937560276344832/source-tweet/524937560276344832.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524964457936863232/source-tweet/524964457936863232.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955385480753153/source-tweet/524955385480753153.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524975031059632128/source-tweet/524975031059632128.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524951948337098752/source-tweet/524951948337098752.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524952005157347328/source-tweet/524952005157347328.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524938146119966720/source-tweet/524938146119966720.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524977767688200192/source-tweet/524977767688200192.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525000220371734528/source-tweet/525000220371734528.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524937635698327553/source-tweet/524937635698327553.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525001708318240768/source-tweet/525001708318240768.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524993009440403456/source-tweet/524993009440403456.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524988783213547520/source-tweet/524988783213547520.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524974135030874114/source-tweet/524974135030874114.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979070288019456/source-tweet/524979070288019456.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524936456742379520/source-tweet/524936456742379520.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524974900017373184/source-tweet/524974900017373184.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524950321064607744/source-tweet/524950321064607744.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524947129354420225/source-tweet/524947129354420225.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524964630221705216/source-tweet/524964630221705216.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524959390332239872/source-tweet/524959390332239872.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524959778125385728/source-tweet/524959778125385728.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524976835881205760/source-tweet/524976835881205760.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524928297986433024/source-tweet/524928297986433024.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524956168754843648/source-tweet/524956168754843648.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524986210402705408/source-tweet/524986210402705408.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525034457086959616/source-tweet/525034457086959616.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524961016904294400/source-tweet/524961016904294400.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524933764330586112/source-tweet/524933764330586112.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524957105296404480/source-tweet/524957105296404480.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979184297590784/source-tweet/524979184297590784.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525033443076628480/source-tweet/525033443076628480.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979089787330560/source-tweet/524979089787330560.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525059193074487296/source-tweet/525059193074487296.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524947716393414656/source-tweet/524947716393414656.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979279701241856/source-tweet/524979279701241856.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524940623594287104/source-tweet/524940623594287104.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524987263835721728/source-tweet/524987263835721728.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524948618776944640/source-tweet/524948618776944640.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524986649315667968/source-tweet/524986649315667968.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525012247249428482/source-tweet/525012247249428482.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524966328734863360/source-tweet/524966328734863360.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524922499466022913/source-tweet/524922499466022913.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525005137405018112/source-tweet/525005137405018112.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525047079325155329/source-tweet/525047079325155329.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524954262657581057/source-tweet/524954262657581057.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524968466559033345/source-tweet/524968466559033345.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524923148576518144/source-tweet/524923148576518144.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524927789892653056/source-tweet/524927789892653056.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525069912679923712/source-tweet/525069912679923712.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525006853626159105/source-tweet/525006853626159105.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525037540508594176/source-tweet/525037540508594176.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524995812854489089/source-tweet/524995812854489089.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525007337724325889/source-tweet/525007337724325889.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524974786460778497/source-tweet/524974786460778497.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524953530193301505/source-tweet/524953530193301505.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525050632211021825/source-tweet/525050632211021825.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524933534692442112/source-tweet/524933534692442112.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524949435886096385/source-tweet/524949435886096385.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524956415589621760/source-tweet/524956415589621760.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524995375468249088/source-tweet/524995375468249088.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524932558275235840/source-tweet/524932558275235840.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524969415318982657/source-tweet/524969415318982657.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524944981753356288/source-tweet/524944981753356288.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524948710812581888/source-tweet/524948710812581888.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524964278030172160/source-tweet/524964278030172160.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525047104239333377/source-tweet/525047104239333377.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524941272549969920/source-tweet/524941272549969920.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524963002768842752/source-tweet/524963002768842752.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524965026256662528/source-tweet/524965026256662528.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524998881676107776/source-tweet/524998881676107776.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525041601890881536/source-tweet/525041601890881536.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525053517531062272/source-tweet/525053517531062272.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524937597206790144/source-tweet/524937597206790144.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524953508978909185/source-tweet/524953508978909185.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955102554361856/source-tweet/524955102554361856.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524935023380545537/source-tweet/524935023380545537.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525038296921960449/source-tweet/525038296921960449.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524928960615186432/source-tweet/524928960615186432.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524958383468978176/source-tweet/524958383468978176.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524987150413336576/source-tweet/524987150413336576.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524977992683237376/source-tweet/524977992683237376.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525000349648949250/source-tweet/525000349648949250.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979702940045313/source-tweet/524979702940045313.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524932292406681601/source-tweet/524932292406681601.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524949939755810817/source-tweet/524949939755810817.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525040252948189186/source-tweet/525040252948189186.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524931083797004288/source-tweet/524931083797004288.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524961513006964736/source-tweet/524961513006964736.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525009541671702528/source-tweet/525009541671702528.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525040492388823041/source-tweet/525040492388823041.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524967231902318592/source-tweet/524967231902318592.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524924954287296512/source-tweet/524924954287296512.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524995942995746819/source-tweet/524995942995746819.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525049481679892481/source-tweet/525049481679892481.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524932635177795585/source-tweet/524932635177795585.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525065123917221888/source-tweet/525065123917221888.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524949897221795840/source-tweet/524949897221795840.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524956731638841344/source-tweet/524956731638841344.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525055273816555520/source-tweet/525055273816555520.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525036180379676673/source-tweet/525036180379676673.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524949003834634240/source-tweet/524949003834634240.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524993166651318273/source-tweet/524993166651318273.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524993523746373633/source-tweet/524993523746373633.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524970488590651395/source-tweet/524970488590651395.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524967811081195520/source-tweet/524967811081195520.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524968564306882560/source-tweet/524968564306882560.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524958238316298240/source-tweet/524958238316298240.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524929550493712384/source-tweet/524929550493712384.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524934419270742017/source-tweet/524934419270742017.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524960277348233216/source-tweet/524960277348233216.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524995166953033729/source-tweet/524995166953033729.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524947604690702336/source-tweet/524947604690702336.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525032458417610752/source-tweet/525032458417610752.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524929014621032451/source-tweet/524929014621032451.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524949243883425793/source-tweet/524949243883425793.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524941119403335681/source-tweet/524941119403335681.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524958516822671360/source-tweet/524958516822671360.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524948790227517440/source-tweet/524948790227517440.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524943804051496961/source-tweet/524943804051496961.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525026031833718785/source-tweet/525026031833718785.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524954105471840256/source-tweet/524954105471840256.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524958173245870080/source-tweet/524958173245870080.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524952808110694400/source-tweet/524952808110694400.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524988555178045440/source-tweet/524988555178045440.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524980483479076864/source-tweet/524980483479076864.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524930678983753728/source-tweet/524930678983753728.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524936931059056640/source-tweet/524936931059056640.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524980402495430656/source-tweet/524980402495430656.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524943638812299265/source-tweet/524943638812299265.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524936075870228480/source-tweet/524936075870228480.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525052526593601537/source-tweet/525052526593601537.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524935906596499456/source-tweet/524935906596499456.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524945225253675008/source-tweet/524945225253675008.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524934507892580352/source-tweet/524934507892580352.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524958384060366849/source-tweet/524958384060366849.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525019265888702464/source-tweet/525019265888702464.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524968330172850177/source-tweet/524968330172850177.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524973920722878464/source-tweet/524973920722878464.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524935731148775424/source-tweet/524935731148775424.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524992063553953792/source-tweet/524992063553953792.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524952322703900672/source-tweet/524952322703900672.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524950117192044544/source-tweet/524950117192044544.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524977812852461568/source-tweet/524977812852461568.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524967631171117056/source-tweet/524967631171117056.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524997921042071553/source-tweet/524997921042071553.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524974566830247936/source-tweet/524974566830247936.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524951279760867328/source-tweet/524951279760867328.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524929559909916672/source-tweet/524929559909916672.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525016728091500545/source-tweet/525016728091500545.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524934897367285760/source-tweet/524934897367285760.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979756371288064/source-tweet/524979756371288064.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524923403183333376/source-tweet/524923403183333376.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524944571256434688/source-tweet/524944571256434688.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525021050627973120/source-tweet/525021050627973120.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524938288277118976/source-tweet/524938288277118976.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524958250706681856/source-tweet/524958250706681856.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524993172271665153/source-tweet/524993172271665153.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524977830124617728/source-tweet/524977830124617728.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525028955049320448/source-tweet/525028955049320448.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524983828000276480/source-tweet/524983828000276480.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525030331775463424/source-tweet/525030331775463424.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524969431534153728/source-tweet/524969431534153728.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525006847095615489/source-tweet/525006847095615489.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524990519777443840/source-tweet/524990519777443840.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524928863714168832/source-tweet/524928863714168832.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524939818347028480/source-tweet/524939818347028480.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524949136173694976/source-tweet/524949136173694976.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524978855531253760/source-tweet/524978855531253760.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524966572277129216/source-tweet/524966572277129216.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524980166289002496/source-tweet/524980166289002496.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524945101542678529/source-tweet/524945101542678529.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524938137320312832/source-tweet/524938137320312832.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524995776578330624/source-tweet/524995776578330624.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524998110268096513/source-tweet/524998110268096513.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525045709402148864/source-tweet/525045709402148864.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524964129745174528/source-tweet/524964129745174528.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525069752566575104/source-tweet/525069752566575104.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525070461936222208/source-tweet/525070461936222208.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525059810312847360/source-tweet/525059810312847360.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524943976902971392/source-tweet/524943976902971392.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524974518960652290/source-tweet/524974518960652290.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524981565462704129/source-tweet/524981565462704129.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524982907052765184/source-tweet/524982907052765184.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524986820850106370/source-tweet/524986820850106370.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979152944779265/source-tweet/524979152944779265.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979179235069952/source-tweet/524979179235069952.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524953077456318464/source-tweet/524953077456318464.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524958992330522624/source-tweet/524958992330522624.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524930499790131200/source-tweet/524930499790131200.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525005134624215041/source-tweet/525005134624215041.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524942851944493056/source-tweet/524942851944493056.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524967572689522688/source-tweet/524967572689522688.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525052976562712576/source-tweet/525052976562712576.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524961070465945600/source-tweet/524961070465945600.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525015193425031168/source-tweet/525015193425031168.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525019992337551360/source-tweet/525019992337551360.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524933600433938432/source-tweet/524933600433938432.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525011747007377408/source-tweet/525011747007377408.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524963878904823808/source-tweet/524963878904823808.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524948520605069312/source-tweet/524948520605069312.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524965597793107968/source-tweet/524965597793107968.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525031823504850944/source-tweet/525031823504850944.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525071809226346496/source-tweet/525071809226346496.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524979925343014912/source-tweet/524979925343014912.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955083297931264/source-tweet/524955083297931264.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524949028728221698/source-tweet/524949028728221698.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525015836226895875/source-tweet/525015836226895875.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525022821534429185/source-tweet/525022821534429185.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524964279456653312/source-tweet/524964279456653312.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524946808271671297/source-tweet/524946808271671297.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524961589406216192/source-tweet/524961589406216192.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525021103559696384/source-tweet/525021103559696384.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524965047647617024/source-tweet/524965047647617024.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524939347699585025/source-tweet/524939347699585025.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525034332474601472/source-tweet/525034332474601472.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955871689052160/source-tweet/524955871689052160.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525019469983531008/source-tweet/525019469983531008.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524976857822011394/source-tweet/524976857822011394.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524924660480487424/source-tweet/524924660480487424.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524954161054756864/source-tweet/524954161054756864.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524950203476869120/source-tweet/524950203476869120.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524960245509287936/source-tweet/524960245509287936.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524926698165633024/source-tweet/524926698165633024.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524943226948820992/source-tweet/524943226948820992.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524960851493924864/source-tweet/524960851493924864.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525069509342683137/source-tweet/525069509342683137.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524944978288852992/source-tweet/524944978288852992.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524937243518312448/source-tweet/524937243518312448.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524954241547632640/source-tweet/524954241547632640.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525012561214058496/source-tweet/525012561214058496.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955170200121344/source-tweet/524955170200121344.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524947475858472960/source-tweet/524947475858472960.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524960814944747520/source-tweet/524960814944747520.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524944610985263104/source-tweet/524944610985263104.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525069189447704576/source-tweet/525069189447704576.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524953569762766848/source-tweet/524953569762766848.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525012912562536448/source-tweet/525012912562536448.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524980775741956097/source-tweet/524980775741956097.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525044725926330368/source-tweet/525044725926330368.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524978999534317568/source-tweet/524978999534317568.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524945585539776512/source-tweet/524945585539776512.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524949884143927296/source-tweet/524949884143927296.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955850591334403/source-tweet/524955850591334403.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524962450786226176/source-tweet/524962450786226176.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525033697045925888/source-tweet/525033697045925888.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525063740769972224/source-tweet/525063740769972224.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524994218964815873/source-tweet/524994218964815873.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524947030616313856/source-tweet/524947030616313856.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525050193360613376/source-tweet/525050193360613376.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524958498770395137/source-tweet/524958498770395137.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525003253185277952/source-tweet/525003253185277952.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524995620088864768/source-tweet/524995620088864768.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524939821815721984/source-tweet/524939821815721984.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525046232209956865/source-tweet/525046232209956865.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524994965366382593/source-tweet/524994965366382593.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524964166936068097/source-tweet/524964166936068097.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524925600318754816/source-tweet/524925600318754816.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524926528342487041/source-tweet/524926528342487041.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524943615437848576/source-tweet/524943615437848576.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525007810871164929/source-tweet/525007810871164929.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524946461621235712/source-tweet/524946461621235712.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525061246702587904/source-tweet/525061246702587904.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525030165190303744/source-tweet/525030165190303744.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524957974226550784/source-tweet/524957974226550784.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525066628359127041/source-tweet/525066628359127041.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524934004152082432/source-tweet/524934004152082432.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524971165056782337/source-tweet/524971165056782337.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524947414050816000/source-tweet/524947414050816000.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524968293577158656/source-tweet/524968293577158656.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524957872296583168/source-tweet/524957872296583168.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524957925157384192/source-tweet/524957925157384192.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525010084003213312/source-tweet/525010084003213312.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524982954880413696/source-tweet/524982954880413696.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524999079127162880/source-tweet/524999079127162880.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525041399344168961/source-tweet/525041399344168961.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524951696376487936/source-tweet/524951696376487936.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524937574281121793/source-tweet/524937574281121793.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525020733622464512/source-tweet/525020733622464512.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524978196589334529/source-tweet/524978196589334529.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525018717038854144/source-tweet/525018717038854144.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525025114111635456/source-tweet/525025114111635456.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524934090542182400/source-tweet/524934090542182400.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524940940721418240/source-tweet/524940940721418240.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524927809773649922/source-tweet/524927809773649922.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524968185016360960/source-tweet/524968185016360960.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524981890567385088/source-tweet/524981890567385088.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524942802908876801/source-tweet/524942802908876801.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525012689568161792/source-tweet/525012689568161792.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524963322525798400/source-tweet/524963322525798400.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524944146025312256/source-tweet/524944146025312256.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524951456362012672/source-tweet/524951456362012672.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524957978806743040/source-tweet/524957978806743040.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525044678765187072/source-tweet/525044678765187072.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524985689905967105/source-tweet/524985689905967105.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524934648473079808/source-tweet/524934648473079808.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524954313337364480/source-tweet/524954313337364480.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524988027425554432/source-tweet/524988027425554432.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525026973459165184/source-tweet/525026973459165184.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524956007324082177/source-tweet/524956007324082177.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524955386697515008/source-tweet/524955386697515008.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524941318389501953/source-tweet/524941318389501953.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524989886483673088/source-tweet/524989886483673088.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524953767402553345/source-tweet/524953767402553345.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524965242213007360/source-tweet/524965242213007360.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524928223499784192/source-tweet/524928223499784192.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524966596897685504/source-tweet/524966596897685504.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524940452383170560/source-tweet/524940452383170560.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524942687003484161/source-tweet/524942687003484161.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524944788525973505/source-tweet/524944788525973505.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524935146806738944/source-tweet/524935146806738944.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524956518404198400/source-tweet/524956518404198400.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524936881348558849/source-tweet/524936881348558849.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524959090125320193/source-tweet/524959090125320193.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524958741259886592/source-tweet/524958741259886592.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524956648151191553/source-tweet/524956648151191553.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524985092343889920/source-tweet/524985092343889920.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/525027958835408896/source-tweet/525027958835408896.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524945095322513408/source-tweet/524945095322513408.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524966565788520448/source-tweet/524966565788520448.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524925223226081282/source-tweet/524925223226081282.json',\n",
       " '/home/hadoop/pheme-rnr-dataset/ottawashooting/non-rumours/524930400272265216/source-tweet/524930400272265216.json']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Paths in all_file_list[1] whose text contains 'non-rumours'.\n",
    "nr = [fname for fname in all_file_list[1] if \"non-rumours\" in fname]\n",
    "nr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(470, 420)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(r), len(nr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Shuffle the paths in all_file_list[1] with a private, seeded generator so the\n",
    "# dev/test split sliced from r_list can be regenerated exactly on a re-run,\n",
    "# without touching the global random state.\n",
    "r_list = random.Random(42).sample(all_file_list[1], len(all_file_list[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def _thread_dir(fname):\n",
    "    \"\"\"Drop everything from 'source-tweet' onward and replace the absolute dataset root with '.'.\"\"\"\n",
    "    return fname.split(\"source-tweet\")[0].replace(\"/home/hadoop/pheme-rnr-dataset\", \".\")\n",
    "\n",
    "\n",
    "# The first N_DEV shuffled paths go to the dev split, the remainder to the test split.\n",
    "N_DEV = 500\n",
    "dev_event = [_thread_dir(fname) for fname in r_list[:N_DEV]]\n",
    "te_event = [_thread_dir(fname) for fname in r_list[N_DEV:]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Write one `rm -r <dir>` line per entry of dev_event into dev_rm.sh.\n",
    "with open(\"dev_rm.sh\", \"w\") as fw:\n",
    "    fw.writelines(\"rm -r %s \\n\" % event for event in dev_event)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Write one `rm -r <dir>` line per entry of te_event into te_rm.sh.\n",
    "with open(\"te_rm.sh\", \"w\") as fw:\n",
    "    fw.writelines(\"rm -r %s \\n\" % event for event in te_event)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 拆分Weibo数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import tqdm\n",
    "import json\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "weibo_dir = \"/home/hadoop/Rumdect/Weibo/\"\n",
    "weibo_file = \"/home/hadoop/Rumdect/Weibo.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 4664/4664 [00:58<00:00, 131.07it/s]\n"
     ]
    }
   ],
   "source": [
    "ids = []\n",
    "contents = []\n",
    "# Every entry of weibo_dir is opened and parsed as a JSON file.\n",
    "file_list = [os.path.join(weibo_dir, fname) for fname in os.listdir(weibo_dir)]\n",
    "for fname in tqdm.tqdm(file_list):\n",
    "    # Decode explicitly as UTF-8 rather than inheriting the locale's default\n",
    "    # codec, which can fail or mis-decode non-ASCII post text.\n",
    "    with open(fname, \"r\", encoding=\"utf-8\") as fr:\n",
    "        data = json.load(fr)\n",
    "    # Only the first record of each file supplies the id and the text.\n",
    "    ids.append(data[0]['id'])\n",
    "    contents.append(data[0]['original_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['id'] = np.array(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['content'] = np.array(contents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initial cat_id of 1 for every row; sized from the frame itself instead of a\n",
    "# hard-coded row count so the cell keeps working when the number of files changes.\n",
    "df['cat_id'] = np.ones(len(df), dtype=np.int64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['3911258862284667', '3910905915934383', '3653016244814594', ...,\n",
       "       '3602611439603519', '3488860002774854', '3534383233018984'],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['id'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "ids = []\n",
    "labels = []\n",
    "# Read the label file through the configured weibo_file path rather than a\n",
    "# second, cwd-relative spelling (presumably the same file -- confirm).\n",
    "with open(weibo_file, \"r\") as fr:\n",
    "    for line in fr:\n",
    "        if not line.strip():\n",
    "            continue  # skip blank lines instead of failing with IndexError\n",
    "        # Fields are tab-separated; for the first two, keep the part after ':'\n",
    "        # (field 0 -> id as a string, field 1 -> label as an int).\n",
    "        s = line.split(\"\\t\")\n",
    "        ids.append(s[0].split(\":\")[1].strip())\n",
    "        labels.append(int(s[1].split(\":\")[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two-column frame pairing each parsed id with its label.\n",
    "label_df = pd.DataFrame({\"id\": np.array(ids), \"label\": np.array(labels)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "ddf = pd.merge(df, label_df, on=\"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "ddf.to_csv(\"./data/WeiboContent.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Weibo数据分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataloader import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.utils.data import DataLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "wc = ContentSet(\"./data/WeiboContent.csv\", label_type=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "wc_loader = DataLoader(wc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from WeiboTopic import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TopicCLS(train_file=\"./data/topic_train.csv\", dev_file=\"./data/topic_dev.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.load_model(model_file=\"./model/topic_model.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "rst = model.infer(wc_loader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([4664])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rst.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "ddf['cat_id'] = rst.cpu().numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "ddf.to_csv(\"./data/WeiboContent.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 分类后的数据分析整理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"./data/topic_dev.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>content</th>\n",
       "      <th>picture_lists</th>\n",
       "      <th>category</th>\n",
       "      <th>cat_id</th>\n",
       "      <th>comment_2c</th>\n",
       "      <th>comment_all</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4076180594044139</td>\n",
       "      <td>#快讯#【吉林松原石化在建项目发生闪爆致3人死亡】17日上午，吉林省松原石油化工股份有限公司...</td>\n",
       "      <td>92d49cb7f2d4d18796ace5884ab6d6c6.jpg</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>7</td>\n",
       "      <td>转发微博\\t又是临时工吧\\t</td>\n",
       "      <td>转发微博\\t又是临时工吧\\t</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3837436721464605</td>\n",
       "      <td>是我朋友的帮帮忙谢谢！寻人启事13940292999。有线索酬金10万帮忙扩散，今天上午一个...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>7</td>\n",
       "      <td>谣言。\\t你受骗上当了，这是虚假消息！请看辟谣网址（O谣场现形记）。尽快删除帖子，因为已被举...</td>\n",
       "      <td>谣言。\\t你受骗上当了，这是虚假消息！请看辟谣网址（O谣场现形记）。尽快删除帖子，因为已被举...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4298227823966934</td>\n",
       "      <td>《让子弹飞》是一部好电影。想要让老百姓捐钱，得先让乡绅捐钱，乡绅们捐完钱，老百姓才会跟着捐。...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>7</td>\n",
       "      <td>救命的这几百亿也要骗，唉\\t开门，查水表的\\t</td>\n",
       "      <td>救命的这几百亿也要骗，唉\\t开门，查水表的\\t所以，别入戏太深\\t花花有小号吗？备一个[二哈...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4173976822981112</td>\n",
       "      <td>购置税上调有望推动年底购车意欲料未来两个月销售可保持平稳增长BY中国金洋资产管理董事总经理郭...</td>\n",
       "      <td>115e011512f5de6745a516c755712a03.jpg</td>\n",
       "      <td>政治</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3977158516575847</td>\n",
       "      <td>\\t\\t【医生累倒在手术室外 瘫坐地上喝葡萄糖】19日，一张医生瘫坐在地上喝葡糖的照片引起关...</td>\n",
       "      <td>29a4920904e73657891c52ee8f8f68ee.jpg</td>\n",
       "      <td>医药健康</td>\n",
       "      <td>3</td>\n",
       "      <td>心疼医生\\t回复@乌龟小姐背着自己的壳:并不是，葡萄糖的是C6H12O6，人体可以直接吸收的。\\t</td>\n",
       "      <td>心疼医生\\t回复@乌龟小姐背着自己的壳:并不是，葡萄糖的是C6H12O6，人体可以直接吸收的...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id                                            content  \\\n",
       "0  4076180594044139  #快讯#【吉林松原石化在建项目发生闪爆致3人死亡】17日上午，吉林省松原石油化工股份有限公司...   \n",
       "1  3837436721464605  是我朋友的帮帮忙谢谢！寻人启事13940292999。有线索酬金10万帮忙扩散，今天上午一个...   \n",
       "2  4298227823966934  《让子弹飞》是一部好电影。想要让老百姓捐钱，得先让乡绅捐钱，乡绅们捐完钱，老百姓才会跟着捐。...   \n",
       "3  4173976822981112  购置税上调有望推动年底购车意欲料未来两个月销售可保持平稳增长BY中国金洋资产管理董事总经理郭...   \n",
       "4  3977158516575847  \\t\\t【医生累倒在手术室外 瘫坐地上喝葡萄糖】19日，一张医生瘫坐在地上喝葡糖的照片引起关...   \n",
       "\n",
       "                          picture_lists category  cat_id  \\\n",
       "0  92d49cb7f2d4d18796ace5884ab6d6c6.jpg     社会生活       7   \n",
       "1                                   NaN     社会生活       7   \n",
       "2                                   NaN     社会生活       7   \n",
       "3  115e011512f5de6745a516c755712a03.jpg       政治       0   \n",
       "4  29a4920904e73657891c52ee8f8f68ee.jpg     医药健康       3   \n",
       "\n",
       "                                          comment_2c  \\\n",
       "0                                     转发微博\\t又是临时工吧\\t   \n",
       "1  谣言。\\t你受骗上当了，这是虚假消息！请看辟谣网址（O谣场现形记）。尽快删除帖子，因为已被举...   \n",
       "2                            救命的这几百亿也要骗，唉\\t开门，查水表的\\t   \n",
       "3                                                NaN   \n",
       "4  心疼医生\\t回复@乌龟小姐背着自己的壳:并不是，葡萄糖的是C6H12O6，人体可以直接吸收的。\\t   \n",
       "\n",
       "                                         comment_all  \n",
       "0                                     转发微博\\t又是临时工吧\\t  \n",
       "1  谣言。\\t你受骗上当了，这是虚假消息！请看辟谣网址（O谣场现形记）。尽快删除帖子，因为已被举...  \n",
       "2  救命的这几百亿也要骗，唉\\t开门，查水表的\\t所以，别入戏太深\\t花花有小号吗？备一个[二哈...  \n",
       "3                                                NaN  \n",
       "4  心疼医生\\t回复@乌龟小姐背着自己的壳:并不是，葡萄糖的是C6H12O6，人体可以直接吸收的...  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cat_id</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>category</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>军事</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>医药健康</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>政治</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>教育考试</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>文体娱乐</th>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>疫情</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>社会生活</th>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>科技</th>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>财经商业</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          cat_id\n",
       "category        \n",
       "军事             5\n",
       "医药健康           3\n",
       "政治             0\n",
       "教育考试           4\n",
       "文体娱乐           6\n",
       "疫情             1\n",
       "社会生活           7\n",
       "科技             8\n",
       "财经商业           2"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per category name, showing the mean cat_id inside that category.\n",
    "df.groupby('category')[['cat_id']].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "dic = {\n",
    "        \"军事\":5,\n",
    "        \"医药健康\":3,\n",
    "        \"政治\":0,\n",
    "        \"教育考试\":4,\n",
    "        \"文体娱乐\":6,\n",
    "        \"疫情\":1,\n",
    "        \"社会生活\":7,\n",
    "        \"科技\":8,\n",
    "        \"财经商业\":2,\n",
    "      }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group the rows of ddf by their cat_id value.\n",
    "gp = ddf.groupby(by=\"cat_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>content</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>259</td>\n",
       "      <td>259</td>\n",
       "      <td>259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11</td>\n",
       "      <td>11</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>244</td>\n",
       "      <td>244</td>\n",
       "      <td>244</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>931</td>\n",
       "      <td>931</td>\n",
       "      <td>931</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>42</td>\n",
       "      <td>42</td>\n",
       "      <td>42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>129</td>\n",
       "      <td>129</td>\n",
       "      <td>129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>868</td>\n",
       "      <td>868</td>\n",
       "      <td>868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1920</td>\n",
       "      <td>1920</td>\n",
       "      <td>1920</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>260</td>\n",
       "      <td>260</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          id  content  label\n",
       "cat_id                      \n",
       "0        259      259    259\n",
       "1         11       11     11\n",
       "2        244      244    244\n",
       "3        931      931    931\n",
       "4         42       42     42\n",
       "5        129      129    129\n",
       "6        868      868    868\n",
       "7       1920     1920   1920\n",
       "8        260      260    260"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gp.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>content</th>\n",
       "      <th>cat_id</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>3513862995205247</td>\n",
       "      <td>TVXQ France 暂时只救回了图,18号场的红海.擦!这位子拍出来的视频肯定嗲啊┭┮﹏┭┮</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>3489099614454084</td>\n",
       "      <td>【中日开战】路透社短讯：中国东海舰队徐州号护卫舰和一艘常规动力潜艇以及两艘海监船在钓鱼岛海域...</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>160</th>\n",
       "      <td>3650475917913991</td>\n",
       "      <td>#财新网摘#【美国老人旅朝被捕 电视现身承认“宿罪”】85岁的美国人纽曼与同伴通过旅行社安排...</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183</th>\n",
       "      <td>3909978840125858</td>\n",
       "      <td>最伟大的键盘党组织，宣布对ISIS全面开战 http://t.cn/RURqBx0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>3593500484588570</td>\n",
       "      <td>看了这个我想哭。。30多年过去了，美军一直在寻找在柬埔寨，越南，老挝，失踪美军士兵遗骸，“挖...</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4514</th>\n",
       "      <td>3918796504841684</td>\n",
       "      <td>#那一天#【两弹一星元勋钱学森】①他少学有成，远赴重洋参物理之奥，而立之年便成为一流火箭专家...</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4523</th>\n",
       "      <td>3593779291912843</td>\n",
       "      <td>看了这个我想哭。。30多年过去了，美军一直在寻找在柬埔寨，越南，老挝，失踪美军士兵遗骸，“挖...</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4539</th>\n",
       "      <td>3917848134960585</td>\n",
       "      <td>[汗]</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4594</th>\n",
       "      <td>3570229739002279</td>\n",
       "      <td>日本右翼分子抵钓鱼岛海域，日本13艘海保厅船护送保驾……这不是民间行为，已经是官方的正式行动...</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4648</th>\n",
       "      <td>3911529298581977</td>\n",
       "      <td>【联合国通过打击IS决议】据@联合国 安理会刚刚以15票赞成一致通过决议，促请有能力的会员国...</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>129 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    id                                            content  \\\n",
       "45    3513862995205247   TVXQ France 暂时只救回了图,18号场的红海.擦!这位子拍出来的视频肯定嗲啊┭┮﹏┭┮   \n",
       "86    3489099614454084  【中日开战】路透社短讯：中国东海舰队徐州号护卫舰和一艘常规动力潜艇以及两艘海监船在钓鱼岛海域...   \n",
       "160   3650475917913991  #财新网摘#【美国老人旅朝被捕 电视现身承认“宿罪”】85岁的美国人纽曼与同伴通过旅行社安排...   \n",
       "183   3909978840125858          最伟大的键盘党组织，宣布对ISIS全面开战 http://t.cn/RURqBx0   \n",
       "205   3593500484588570  看了这个我想哭。。30多年过去了，美军一直在寻找在柬埔寨，越南，老挝，失踪美军士兵遗骸，“挖...   \n",
       "...                ...                                                ...   \n",
       "4514  3918796504841684  #那一天#【两弹一星元勋钱学森】①他少学有成，远赴重洋参物理之奥，而立之年便成为一流火箭专家...   \n",
       "4523  3593779291912843  看了这个我想哭。。30多年过去了，美军一直在寻找在柬埔寨，越南，老挝，失踪美军士兵遗骸，“挖...   \n",
       "4539  3917848134960585                                                [汗]   \n",
       "4594  3570229739002279  日本右翼分子抵钓鱼岛海域，日本13艘海保厅船护送保驾……这不是民间行为，已经是官方的正式行动...   \n",
       "4648  3911529298581977  【联合国通过打击IS决议】据@联合国 安理会刚刚以15票赞成一致通过决议，促请有能力的会员国...   \n",
       "\n",
       "      cat_id  label  \n",
       "45         5      0  \n",
       "86         5      1  \n",
       "160        5      1  \n",
       "183        5      0  \n",
       "205        5      1  \n",
       "...      ...    ...  \n",
       "4514       5      0  \n",
       "4523       5      1  \n",
       "4539       5      0  \n",
       "4594       5      1  \n",
       "4648       5      0  \n",
       "\n",
       "[129 rows x 4 columns]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows of ddf whose cat_id equals 5.\n",
    "ddf.loc[ddf.cat_id == 5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>content</th>\n",
       "      <th>cat_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3911258862284667</td>\n",
       "      <td>#拍案#【郑州男子开车撞死4名拆迁人员 今被执行死刑】遵照最高法下达的执行死刑命令，郑州中院...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3912105105056296</td>\n",
       "      <td>【关于劳动合同，你不得不知的8件事】求职找工作，拿到合同就万事大吉？连细节都没看清就挥着笔杆...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3919158834602739</td>\n",
       "      <td>【请为暴雪中的他们点赞[求关注]】据测算，新疆乌鲁木齐昨日降雪量达到1951年有气象记录以来...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>3486783347798216</td>\n",
       "      <td>【紧急发布】刚收到家长私信：“我实在没粉丝，请各位名人与好心人帮忙转发一下。刘皓曦，男，三岁...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>3601551484167477</td>\n",
       "      <td>刚才，百度急忙把“抢尸县长”贺遵庆个人资料简历修改了，“把维稳学硕士学位”改成了“硕士学位”...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4657</th>\n",
       "      <td>3521716896494896</td>\n",
       "      <td>国家最新出台的《新资源食品管理办法》中猫属于界定中的第一项无食用习惯的动物，猫拥有不被食用的...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4660</th>\n",
       "      <td>3495729601661914</td>\n",
       "      <td>【静安，再见……】【全文】http://t.cn/zl5OXSy</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4661</th>\n",
       "      <td>3602611439603519</td>\n",
       "      <td>【是真还是假？】2012年杜春晓以六百多分的成绩考入武汉理工，今年暑假在广水强海制鞋厂做暑期...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4662</th>\n",
       "      <td>3488860002774854</td>\n",
       "      <td>9月10 日。宁波妇儿医院今天下午1点48分，一妇女及一婴儿在住院楼跳楼，后抢救无效死亡。具...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4663</th>\n",
       "      <td>3534383233018984</td>\n",
       "      <td>【杭州出现新公交骗局】杭州最近发生了系列高级骗局，乘公交车的亲人们警惕哦。骗子最新手法：在公...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1920 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    id                                            content  \\\n",
       "0     3911258862284667  #拍案#【郑州男子开车撞死4名拆迁人员 今被执行死刑】遵照最高法下达的执行死刑命令，郑州中院...   \n",
       "3     3912105105056296  【关于劳动合同，你不得不知的8件事】求职找工作，拿到合同就万事大吉？连细节都没看清就挥着笔杆...   \n",
       "5     3919158834602739  【请为暴雪中的他们点赞[求关注]】据测算，新疆乌鲁木齐昨日降雪量达到1951年有气象记录以来...   \n",
       "9     3486783347798216  【紧急发布】刚收到家长私信：“我实在没粉丝，请各位名人与好心人帮忙转发一下。刘皓曦，男，三岁...   \n",
       "10    3601551484167477  刚才，百度急忙把“抢尸县长”贺遵庆个人资料简历修改了，“把维稳学硕士学位”改成了“硕士学位”...   \n",
       "...                ...                                                ...   \n",
       "4657  3521716896494896  国家最新出台的《新资源食品管理办法》中猫属于界定中的第一项无食用习惯的动物，猫拥有不被食用的...   \n",
       "4660  3495729601661914                   【静安，再见……】【全文】http://t.cn/zl5OXSy   \n",
       "4661  3602611439603519  【是真还是假？】2012年杜春晓以六百多分的成绩考入武汉理工，今年暑假在广水强海制鞋厂做暑期...   \n",
       "4662  3488860002774854  9月10 日。宁波妇儿医院今天下午1点48分，一妇女及一婴儿在住院楼跳楼，后抢救无效死亡。具...   \n",
       "4663  3534383233018984  【杭州出现新公交骗局】杭州最近发生了系列高级骗局，乘公交车的亲人们警惕哦。骗子最新手法：在公...   \n",
       "\n",
       "      cat_id  \n",
       "0          7  \n",
       "3          7  \n",
       "5          7  \n",
       "9          7  \n",
       "10         7  \n",
       "...      ...  \n",
       "4657       7  \n",
       "4660       7  \n",
       "4661       7  \n",
       "4662       7  \n",
       "4663       7  \n",
       "\n",
       "[1920 rows x 3 columns]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows of df whose cat_id equals 7.\n",
    "df.loc[df.cat_id == 7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Weibo content table from the local data directory.\n",
    "ddf = pd.read_csv(filepath_or_buffer=\"./data/WeiboContent.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group the rows of ddf by their cat_id value.\n",
    "gp = ddf.groupby(by=['cat_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>content</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>259</td>\n",
       "      <td>259</td>\n",
       "      <td>259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11</td>\n",
       "      <td>11</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>244</td>\n",
       "      <td>244</td>\n",
       "      <td>244</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>931</td>\n",
       "      <td>931</td>\n",
       "      <td>931</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>42</td>\n",
       "      <td>42</td>\n",
       "      <td>42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>129</td>\n",
       "      <td>129</td>\n",
       "      <td>129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>868</td>\n",
       "      <td>868</td>\n",
       "      <td>868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1920</td>\n",
       "      <td>1920</td>\n",
       "      <td>1920</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>260</td>\n",
       "      <td>260</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          id  content  label\n",
       "cat_id                      \n",
       "0        259      259    259\n",
       "1         11       11     11\n",
       "2        244      244    244\n",
       "3        931      931    931\n",
       "4         42       42     42\n",
       "5        129      129    129\n",
       "6        868      868    868\n",
       "7       1920     1920   1920\n",
       "8        260      260    260"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gp.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> 可以用第5类做验证集、第0类做测试集，其余类别（1、2、3、4、6、7、8）用于训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat_id</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.760618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.454545</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.790984</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.335124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.904762</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.480620</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.317972</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.603646</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.273077</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           label\n",
       "cat_id          \n",
       "0       0.760618\n",
       "1       0.454545\n",
       "2       0.790984\n",
       "3       0.335124\n",
       "4       0.904762\n",
       "5       0.480620\n",
       "6       0.317972\n",
       "7       0.603646\n",
       "8       0.273077"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean of `label` per cat_id (same value as sum/count). Selecting the\n",
    "# column before aggregating keeps the text column `content` out of the\n",
    "# sum, which newer pandas versions no longer drop silently.\n",
    "gp[['label']].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> 每一类与主题相关的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training split: (id, label) of categories 1-4 and 6-8, stacked in that\n",
    "# category order.\n",
    "tr_ids = pd.concat(\n",
    "    [ddf.loc[ddf.cat_id == cat, ['id', 'label']] for cat in [1, 2, 3, 4, 6, 7, 8]]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4276, 2)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tr_ids.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the training split without the DataFrame index column.\n",
    "tr_ids.to_csv(path_or_buf=\"./data/tr_ids.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dev split: (id, label) of category 5.\n",
    "dev_ids = ddf.loc[ddf.cat_id == 5, ['id', 'label']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# index=False keeps the file layout identical to tr_ids.csv (id, label\n",
    "# only); otherwise the DataFrame index is written as an extra column.\n",
    "dev_ids.to_csv(\"./data/dev_ids.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>3513862995205247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>3489099614454084</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>160</th>\n",
       "      <td>3650475917913991</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183</th>\n",
       "      <td>3909978840125858</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>3593500484588570</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4514</th>\n",
       "      <td>3918796504841684</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4523</th>\n",
       "      <td>3593779291912843</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4539</th>\n",
       "      <td>3917848134960585</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4594</th>\n",
       "      <td>3570229739002279</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4648</th>\n",
       "      <td>3911529298581977</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>129 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    id\n",
       "45    3513862995205247\n",
       "86    3489099614454084\n",
       "160   3650475917913991\n",
       "183   3909978840125858\n",
       "205   3593500484588570\n",
       "...                ...\n",
       "4514  3918796504841684\n",
       "4523  3593779291912843\n",
       "4539  3917848134960585\n",
       "4594  3570229739002279\n",
       "4648  3911529298581977\n",
       "\n",
       "[129 rows x 1 columns]"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dev_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3910905915934383</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3499245883634226</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3536502098911595</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>3545679399574108</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>3491552036431207</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4624</th>\n",
       "      <td>3580088660229461</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4625</th>\n",
       "      <td>3501410584664439</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4646</th>\n",
       "      <td>3604696122007267</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4650</th>\n",
       "      <td>3625818762514205</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4658</th>\n",
       "      <td>3911968056683495</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>259 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    id  label\n",
       "1     3910905915934383      0\n",
       "4     3499245883634226      1\n",
       "6     3536502098911595      1\n",
       "17    3545679399574108      1\n",
       "20    3491552036431207      1\n",
       "...                ...    ...\n",
       "4624  3580088660229461      1\n",
       "4625  3501410584664439      1\n",
       "4646  3604696122007267      1\n",
       "4650  3625818762514205      1\n",
       "4658  3911968056683495      0\n",
       "\n",
       "[259 rows x 2 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Test split: (id, label) of category 0; displayed for inspection.\n",
    "te_ids = ddf.loc[ddf.cat_id == 0, ['id', 'label']]\n",
    "te_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the test split alongside tr_ids.csv / dev_ids.csv. Without a path,\n",
    "# to_csv() only returns the CSV text and nothing is written to disk.\n",
    "te_ids.to_csv(\"./data/te_ids.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
